Step 1: Data Setup and Exploration

# ───────────────────────────────────────────────────────────────────────────────
# 0. Install & load required packages
# ───────────────────────────────────────────────────────────────────────────────
pkgs <- c("sf", "dplyr", "readr", "tmap", "spdep", "tigris", "stringr")
to_install <- pkgs[!sapply(pkgs, requireNamespace, quietly = TRUE)]
if(length(to_install)) install.packages(to_install, repos = "https://cloud.r-project.org")
lapply(pkgs, library, character.only = TRUE)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
## [[1]]
## [1] "sf"        "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [7] "methods"   "base"     
## 
## [[2]]
## [1] "dplyr"     "sf"        "stats"     "graphics"  "grDevices" "utils"    
## [7] "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "readr"     "dplyr"     "sf"        "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "tmap"      "readr"     "dplyr"     "sf"        "stats"     "graphics" 
##  [7] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[5]]
##  [1] "spdep"     "spData"    "tmap"      "readr"     "dplyr"     "sf"       
##  [7] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [13] "base"     
## 
## [[6]]
##  [1] "tigris"    "spdep"     "spData"    "tmap"      "readr"     "dplyr"    
##  [7] "sf"        "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [13] "methods"   "base"     
## 
## [[7]]
##  [1] "stringr"   "tigris"    "spdep"     "spData"    "tmap"      "readr"    
##  [7] "dplyr"     "sf"        "stats"     "graphics"  "grDevices" "utils"    
## [13] "datasets"  "methods"   "base"
options(tigris_use_cache = TRUE)
# ───────────────────────────────────────────────────────────────────────────────
# 1. Read your DMV attribute table & standardize FIPS
# ───────────────────────────────────────────────────────────────────────────────
dmv_data <- read_csv(
  "/Users/g35441498/Desktop/spatial_statistics_final_project-main/dmv_data.csv",
  show_col_types = FALSE
) %>%
  mutate(fips = str_pad(as.character(fips), width = 11, pad = "0"))
# ───────────────────────────────────────────────────────────────────────────────
# 2. Download & bind Census tract boundaries for DC, MD, VA
# ───────────────────────────────────────────────────────────────────────────────
dc_tracts <- tracts(state = "DC", year = 2022, cb = TRUE)
md_tracts <- tracts(state = "MD", year = 2022, cb = TRUE)
va_tracts <- tracts(state = "VA", year = 2022, cb = TRUE)

tracts <- bind_rows(dc_tracts, md_tracts, va_tracts) %>%
  mutate(GEOID = str_pad(GEOID, width = 11, pad = "0"))

# ───────────────────────────────────────────────────────────────────────────────
# 3. Join attributes onto tract geometries
# ───────────────────────────────────────────────────────────────────────────────
dmv_sf <- tracts %>%
  left_join(dmv_data, by = c("GEOID" = "fips"))
# ───────────────────────────────────────────────────────────────────────────────
# 4. Filter & map asthma prevalence
# ───────────────────────────────────────────────────────────────────────────────
dmv_sf_clean <- dmv_sf %>% filter(!is.na(asthma_prev))

tmap_mode("plot")
## ℹ tmap mode set to "plot".
tm_shape(dmv_sf_clean) +
  tm_polygons(
    "asthma_prev",
    palette      = "Reds",
    style        = "quantile",
    border.alpha = 0.4,
    title        = "Asthma Prev (%)"
  ) +
  tm_layout(title = "Asthma Prevalence in DMV Tracts")
## 
## ── tmap v3 code detected ───────────────────────────────────────────────────────
## [v3->v4] `tm_polygons()`: instead of `style = "quantile"`, use fill.scale =
## `tm_scale_intervals()`.
## ℹ Migrate the argument(s) 'style', 'palette' (rename to 'values') to
##   'tm_scale_intervals(<HERE>)'[v3->v4] `tm_polygons()`: use `col_alpha` instead of `border.alpha`.[v3->v4] `tm_polygons()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'[v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(title = )`[cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Reds" is named
## "brewer.reds"Multiple palettes called "reds" found: "brewer.reds", "matplotlib.reds". The first one, "brewer.reds", is returned.

# ───────────────────────────────────────────────────────────────────────────────
# 5. Build Queen contiguity neighbors & spatial weights
# ───────────────────────────────────────────────────────────────────────────────
nb <- poly2nb(dmv_sf_clean, queen = TRUE)
lw <- nb2listw(nb, style = "W", zero.policy = TRUE)

# 6. Plot tract boundaries + neighbor links
plot(
  st_geometry(dmv_sf_clean),
  border = "lightgrey", reset = FALSE,
  main   = "DMV Tracts & Queen Neighbors"
)
centroids <- st_coordinates(st_centroid(dmv_sf_clean))
## Warning: st_centroid assumes attributes are constant over geometries
plot(nb, coords = centroids, add = TRUE, col = "blue")

Step 2: Spatial Autocorrelation

# Global Moran’s I test for asthma prevalence
moran_res <- spdep::moran.test(
  dmv_sf_clean$asthma_prev,
  lw,
  zero.policy = TRUE
)
print(moran_res)
## 
##  Moran I test under randomisation
## 
## data:  dmv_sf_clean$asthma_prev  
## weights: lw    
## 
## Moran I statistic standard deviate = 74.68, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      7.139356e-01     -2.611648e-04      9.145934e-05
# Moran scatterplot
spdep::moran.plot(
  dmv_sf_clean$asthma_prev,
  lw,
  zero.policy = TRUE,
  labels = FALSE,
  pch    = 16,
  main   = "Moran Scatterplot: Asthma Prevalence"
)

# Interpretation:
# - Look at Moran’s I statistic and its p-value in moran_res.
# - A significantly positive Moran’s I (p < 0.05) indicates spatial clustering 
#   of high (or low) asthma prevalence, justifying spatial regression models.

#Step 3: variables choosing

# ---- 0.  Packages ------------------------------------------------------------
library(dplyr)
library(tidyr)
library(ggplot2)
library(broom)          # tidy() for model output
library(psych)          # describe(), corr.test()
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggcorrplot)     # prettier correlation heat-map

# -----------------------------------------------------------------
# 0) Define variable groups for clarity & easy sub-setting
# -----------------------------------------------------------------

## 0a. Pollution & physical environment
vars_pollute <- c("pm2.5", "ozone", "diesel", "road_proximity",
                  "percent_in_flood_zone")      # flood exposure

## 0b. Socio-economic status (SES & financial stress)
vars_ses <- c("housing_cost_burden", "poverty", "unemployment")

## 0c. Education & language barriers
vars_edu_lang <- c("no_highschool", "limited_english")

## 0d. Household crowding & caregiving
vars_household <- c("crowded_housing", "single_parent")

## 0e. Age structure
vars_age <- c("age_under_17", "age_over_65")

## 0f. Racial/ethnic composition
vars_race <- c("minority")   # can add more like 'black', 'hispanic' if needed later

## 0g. Outcome
var_outcome <- "asthma_prev"


# ---------------------------------------------------------------
# Combine everything into ONE master vector for the model step
# ---------------------------------------------------------------
vars_all <- c(var_outcome,
              vars_pollute,
              vars_ses,
              vars_edu_lang,
              vars_household,
              vars_age,
              vars_race)
# ---------------------------------------------------------------
# 1) Subset and drop rows with any missing values in vars_all
# ---------------------------------------------------------------
model_df <- dmv_data %>%
  dplyr::select(all_of(vars_all)) %>%
  tidyr::drop_na()          # geometry stays intact if dmv_data is an sf object
## 2a. Spearman correlations with asthma_prev -------------------------------
cont_vars  <- setdiff(names(model_df), "asthma_prev")

cor_out <- psych::corr.test(model_df[, c("asthma_prev", cont_vars)],
                             method = "spearman", adjust = "BH")

# neat heat-map (upper triangle) -------------------------------------------
ggcorrplot(cor_out$r,
           hc.order = TRUE,            # cluster similar variables
           type      = "upper",
           lab       = TRUE,
           title     = "Spearman correlations with asthma prevalence")

# ---- Packages ---------------------------------------------------------------
if (!require(car)) install.packages("car")
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
library(car)          # for vif()

# ---- Trimmed-variable linear model -----------------------------------------
lm_trim <- lm(
  asthma_prev ~ pm2.5 + ozone + diesel + road_proximity + 
                percent_in_flood_zone +
                housing_cost_burden + poverty + unemployment +
                no_highschool + limited_english +
                crowded_housing + single_parent +
                age_under_17 + age_over_65,
  data = dmv_data
)

# ---- Re-estimate VIFs -------------------------------------------------------
vif_vals <- vif(lm_trim)
print(vif_vals)
##                 pm2.5                 ozone                diesel 
##              3.795824              2.138744              3.189733 
##        road_proximity percent_in_flood_zone   housing_cost_burden 
##              1.450347              1.158879              2.704069 
##               poverty          unemployment         no_highschool 
##              2.895095              1.405538              2.225353 
##       limited_english       crowded_housing         single_parent 
##              2.395978              1.946115              2.055992 
##          age_under_17           age_over_65 
##              1.828245              1.382131
# optional: flag any values > 4
vif_vals[vif_vals > 4]
## named numeric(0)

Step 4: Linear Regression Model

library(dplyr)
library(spdep)
library(tidyr)

# -------------------------------------------------------------------
# 1) Define variables for the final OLS / spatial-diagnostic workflow
# -------------------------------------------------------------------
vars_selected <- c(
  "asthma_prev",
  "pm2.5", "ozone", "road_proximity", "percent_in_flood_zone",
  "poverty", "crowded_housing", "limited_english",
  "age_under_17", "minority"
)

# -------------------------------------------------------------------
# 2) Keep only tracts with complete data on vars
# -------------------------------------------------------------------
model_data <- dmv_sf_clean %>%
  dplyr::select(all_of(vars_selected)) %>%   # keep geometry
  tidyr::drop_na()

# -------------------------------------------------------------------
# 3) Build neighbors & spatial weights on the SAME subset
# -------------------------------------------------------------------
nb_mod <- spdep::poly2nb(model_data, queen = TRUE)
## Warning in spdep::poly2nb(model_data, queen = TRUE): some observations have no neighbours;
## if this seems unexpected, try increasing the snap argument.
## Warning in spdep::poly2nb(model_data, queen = TRUE): neighbour object has 16 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
lw_mod <- spdep::nb2listw(nb_mod, style = "W", zero.policy = TRUE)

# -------------------------------------------------------------------
# 4) Fit ordinary least-squares (OLS) model
# -------------------------------------------------------------------
ols_model <- lm(
  asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
                poverty + crowded_housing + limited_english +
                age_under_17 + minority,
  data = model_data
)
summary(ols_model)    # inspect coefficients
## 
## Call:
## lm(formula = asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone + 
##     poverty + crowded_housing + limited_english + age_under_17 + 
##     minority, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4385 -0.5649 -0.0220  0.5356  5.5714 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.125e+01  4.019e-01  27.984  < 2e-16 ***
## pm2.5                 -1.529e-01  4.737e-02  -3.229  0.00126 ** 
## ozone                  1.686e-01  1.747e-02   9.652  < 2e-16 ***
## road_proximity        -5.431e-04  5.034e-04  -1.079  0.28076    
## percent_in_flood_zone -4.908e-04  1.517e-03  -0.324  0.74624    
## poverty                1.782e-03  4.532e-05  39.318  < 2e-16 ***
## crowded_housing       -1.317e-04  5.042e-04  -0.261  0.79395    
## limited_english       -2.268e-03  1.338e-04 -16.949  < 2e-16 ***
## age_under_17          -9.877e-04  4.753e-05 -20.778  < 2e-16 ***
## minority               2.607e-04  1.849e-05  14.103  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9432 on 2509 degrees of freedom
## Multiple R-squared:  0.514,  Adjusted R-squared:  0.5123 
## F-statistic: 294.9 on 9 and 2509 DF,  p-value: < 2.2e-16
# -------------------------------------------------------------------
# 5) Attach residuals to the sf data frame
# -------------------------------------------------------------------
model_data$resid_ols <- residuals(ols_model)

# -------------------------------------------------------------------
# 6) Global Moran’s I test on the OLS residuals
# -------------------------------------------------------------------
resid_moran <- spdep::moran.test(
  model_data$resid_ols,
  lw_mod,
  zero.policy = TRUE
)
print(resid_moran)
## 
##  Moran I test under randomisation
## 
## data:  model_data$resid_ols  
## weights: lw_mod  
## n reduced by no-neighbour observations  
## 
## Moran I statistic standard deviate = 34.266, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.4634956078     -0.0003984064      0.0001832732
# -------------------------------------------------------------------
# 7) Moran scatterplot of OLS residuals
# -------------------------------------------------------------------
spdep::moran.plot(
  model_data$resid_ols,
  lw_mod,
  zero.policy = TRUE,
  labels = FALSE,
  pch    = 16,
  main   = "Moran Scatterplot: OLS Residuals"
)

Step 5: SLM SEM Regression

library(spatialreg)   # for lagsarlm() and errorsarlm()
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
## 
##     get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
##     set.coresOption, set.mcOption, set.VerboseOption,
##     set.ZeroPolicyOption
library(spdep)        # for moran.test()

# ───────────────────────────────────────────────────────────────────────────────
# 4.1 Spatial Lag Model (SLM)
# ───────────────────────────────────────────────────────────────────────────────
slm_model <- lagsarlm(
  asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
                poverty + crowded_housing + limited_english +
                age_under_17 + minority,
  data = model_data,
  listw = lw_mod,
  zero.policy = TRUE
)
summary(slm_model)
## 
## Call:lagsarlm(formula = asthma_prev ~ pm2.5 + ozone + road_proximity + 
##     percent_in_flood_zone + poverty + crowded_housing + limited_english + 
##     age_under_17 + minority, data = model_data, listw = lw_mod, 
##     zero.policy = TRUE)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -3.887119 -0.419133 -0.040171  0.363403  6.753178 
## 
## Type: lag 
## Regions with no neighbours included:
##  18 267 381 569 781 929 1785 2138 
## Coefficients: (asymptotic standard errors) 
##                          Estimate  Std. Error  z value  Pr(>|z|)
## (Intercept)            5.9620e+00  3.7195e-01  16.0288 < 2.2e-16
## pm2.5                 -6.5100e-02  3.8691e-02  -1.6825   0.09246
## ozone                  1.0794e-01  1.4364e-02   7.5143 5.729e-14
## road_proximity        -8.9647e-04  4.0981e-04  -2.1875   0.02870
## percent_in_flood_zone -1.6111e-03  1.2331e-03  -1.3065   0.19137
## poverty                1.3158e-03  3.9117e-05  33.6387 < 2.2e-16
## crowded_housing        2.9583e-04  4.1002e-04   0.7215   0.47059
## limited_english       -1.4121e-03  1.1042e-04 -12.7884 < 2.2e-16
## age_under_17          -6.1044e-04  4.0211e-05 -15.1807 < 2.2e-16
## minority               1.5874e-04  1.5246e-05  10.4117 < 2.2e-16
## 
## Rho: 0.44343, LR test value: 910.94, p-value: < 2.22e-16
## Asymptotic standard error: 0.014186
##     z-value: 31.259, p-value: < 2.22e-16
## Wald statistic: 977.13, p-value: < 2.22e-16
## 
## Log likelihood: -2966.586 for lag model
## ML residual variance (sigma squared): 0.58819, (sigma: 0.76694)
## Number of observations: 2519 
## Number of parameters estimated: 12 
## AIC: 5957.2, (AIC for lm: 6866.1)
## LM test for residual autocorrelation
## test value: 113.82, p-value: < 2.22e-16
# ───────────────────────────────────────────────────────────────────────────────
# 4.2 Spatial Error Model (SEM)
# ───────────────────────────────────────────────────────────────────────────────
sem_modelel <- errorsarlm(
  asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
                poverty + crowded_housing + limited_english +
                age_under_17 + minority,
  data = model_data,
  listw = lw_mod,
  zero.policy = TRUE
)
summary(sem_modelel)
## 
## Call:errorsarlm(formula = asthma_prev ~ pm2.5 + ozone + road_proximity + 
##     percent_in_flood_zone + poverty + crowded_housing + limited_english + 
##     age_under_17 + minority, data = model_data, listw = lw_mod, 
##     zero.policy = TRUE)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -3.238607 -0.359284 -0.042705  0.300107  6.130692 
## 
## Type: error 
## Regions with no neighbours included:
##  18 267 381 569 781 929 1785 2138 
## Coefficients: (asymptotic standard errors) 
##                          Estimate  Std. Error  z value  Pr(>|z|)
## (Intercept)            1.1581e+01  8.4766e-01  13.6625 < 2.2e-16
## pm2.5                 -1.7084e-01  9.9495e-02  -1.7171  0.085958
## ozone                  1.1450e-01  3.5950e-02   3.1850  0.001447
## road_proximity         3.1556e-04  5.4295e-04   0.5812  0.561105
## percent_in_flood_zone -2.3233e-03  1.4176e-03  -1.6388  0.101247
## poverty                1.1221e-03  3.8071e-05  29.4738 < 2.2e-16
## crowded_housing        7.2374e-04  3.5048e-04   2.0650  0.038922
## limited_english       -7.7913e-04  1.1817e-04  -6.5931 4.306e-11
## age_under_17          -5.0756e-04  4.1256e-05 -12.3025 < 2.2e-16
## minority               1.1000e-04  1.9291e-05   5.7020 1.184e-08
## 
## Lambda: 0.74488, LR test value: 1265.8, p-value: < 2.22e-16
## Asymptotic standard error: 0.013995
##     z-value: 53.226, p-value: < 2.22e-16
## Wald statistic: 2833, p-value: < 2.22e-16
## 
## Log likelihood: -2789.149 for error model
## ML residual variance (sigma squared): 0.45401, (sigma: 0.6738)
## Number of observations: 2519 
## Number of parameters estimated: 12 
## AIC: 5602.3, (AIC for lm: 6866.1)
# ───────────────────────────────────────────────────────────────────────────────
# 4.3 Compare AIC across models
# ───────────────────────────────────────────────────────────────────────────────
aic_ols <- AIC(ols_model)
aic_slm <- AIC(slm_model)
aic_sem <- AIC(sem_modelel)

cat("AIC - OLS:", aic_ols, "\n",
    "AIC - Spatial Lag:", aic_slm, "\n",
    "AIC - Spatial Error:", aic_sem, "\n")
## AIC - OLS: 6866.108 
##  AIC - Spatial Lag: 5957.172 
##  AIC - Spatial Error: 5602.297
# ───────────────────────────────────────────────────────────────────────────────
# 4.4 Compute Pseudo-R² for each model
# ───────────────────────────────────────────────────────────────────────────────
pseudoR2 <- function(resid, y) {
  sse <- sum(resid^2)
  sst <- sum((y - mean(y))^2)
  1 - (sse / sst)
}

r2_ols <- pseudoR2(residuals(ols_model), model_data$asthma_prev)
r2_slm <- pseudoR2(residuals(slm_model), model_data$asthma_prev)
r2_sem <- pseudoR2(residuals(sem_modelel), model_data$asthma_prev)

cat("Pseudo-R² - OLS:", r2_ols, "\n",
    "Pseudo-R² - Spatial Lag:", r2_slm, "\n",
    "Pseudo-R² - Spatial Error:", r2_sem, "\n")
## Pseudo-R² - OLS: 0.5140215 
##  Pseudo-R² - Spatial Lag: 0.6774207 
##  Pseudo-R² - Spatial Error: 0.7510107
# ───────────────────────────────────────────────────────────────────────────────
# 4.5 Moran’s I on residuals for each model
# ───────────────────────────────────────────────────────────────────────────────
moran_ols <- moran.test(residuals(ols_model), lw_mod, zero.policy = TRUE)
moran_slm <- moran.test(residuals(slm_model), lw_mod, zero.policy = TRUE)
moran_sem <- moran.test(residuals(sem_modelel), lw_mod, zero.policy = TRUE)

cat("Moran's I p-value - OLS Residuals:", moran_ols$p.value, "\n",
    "Moran's I p-value - Lag Residuals:", moran_slm$p.value, "\n",
    "Moran's I p-value - Error Residuals:", moran_sem$p.value, "\n")
## Moran's I p-value - OLS Residuals: 1.240044e-257 
##  Moran's I p-value - Lag Residuals: 1.227379e-17 
##  Moran's I p-value - Error Residuals: 1

#Step 6 Car model

library(spdep)        # spautolm() for CAR
library(spatialreg)   # moran.test()

# ── 1·1  Fit the CAR model (Gaussian response) ────────────────────────────────
car_model <- spautolm(
 asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
                poverty + crowded_housing + limited_english +
                age_under_17 + minority,
  data   = model_data,
  listw  = lw_mod,
  family = "CAR",          # <-- conditional autoregressive
  zero.policy = TRUE
)
## Warning in spautolm(asthma_prev ~ pm2.5 + ozone + road_proximity +
## percent_in_flood_zone + : Non-symmetric spatial weights in CAR model
## Warning in sqrt(fdHess[1, 1]): NaNs produced
summary(car_model)
## 
## Call: spautolm(formula = asthma_prev ~ pm2.5 + ozone + road_proximity + 
##     percent_in_flood_zone + poverty + crowded_housing + limited_english + 
##     age_under_17 + minority, data = model_data, listw = lw_mod, 
##     family = "CAR", zero.policy = TRUE)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -3.964024 -0.441604 -0.025786  0.377974  5.879373 
## 
## Regions with no neighbours included:
##  18 267 381 569 781 929 1785 2138 
## 
## Coefficients: 
##                          Estimate  Std. Error  z value  Pr(>|z|)
## (Intercept)            9.3276e+00  5.4330e-01  17.1687 < 2.2e-16
## pm2.5                  4.4551e-02  6.4223e-02   0.6937 0.4878701
## ozone                  5.6472e-02  2.4136e-02   2.3398 0.0192963
## road_proximity        -3.9400e-03  5.6148e-04  -7.0172 2.264e-12
## percent_in_flood_zone  1.4221e-02  1.6010e-03   8.8822 < 2.2e-16
## poverty                1.3377e-03  4.5656e-05  29.2997 < 2.2e-16
## crowded_housing       -2.3843e-04  4.5277e-04  -0.5266 0.5984644
## limited_english       -2.0298e-03  1.3684e-04 -14.8329 < 2.2e-16
## age_under_17          -1.6299e-04  4.8268e-05  -3.3767 0.0007336
## minority               2.0228e-04  2.0867e-05   9.6942 < 2.2e-16
## 
## Lambda: 0.65445 LR test value: 488.12 p-value: < 2.22e-16 
## Numerical Hessian standard error of lambda: NaN 
## 
## Log likelihood: -3177.995 
## ML residual variance (sigma squared): 0.68789, (sigma: 0.82939)
## Number of observations: 2519 
## Number of parameters estimated: 12 
## AIC: 6380
# ── 1·2  Compare information criteria (lower = better) ────────────────────────
AICvals <- c(
  OLS   = AIC(ols_model),
  SLM   = AIC(slm_model),
  SEM   = AIC(sem_modelel),
  CAR   = AIC(car_model)
)
print(round(AICvals, 1))
##    OLS    SLM    SEM    CAR 
## 6866.1 5957.2 5602.3 6380.0
# ── 1·3  Pseudo-R² for CAR (same formula as before) ───────────────────────────
pseudoR2 <- function(resid, y) 1 - sum(resid^2)/sum((y - mean(y))^2)
r2_car <- pseudoR2(residuals(car_model), model_data$asthma_prev)
print(r2_car)
## [1] 0.6834214
# ── 1·4  Residual Moran’s I to check remaining autocorrelation ────────────────
moran_car <- moran.test(residuals(car_model), lw_mod, zero.policy = TRUE)
cat("CAR residual Moran p-value:", moran_car$p.value, "\n")
## CAR residual Moran p-value: 0.997895

#Step7: SDEM model

library(spatialreg)
library(Matrix)

# ───────────────────────────────────────────────────────────────────────────────
# 1. Fit SEM & SDEM 
# ───────────────────────────────────────────────────────────────────────────────
sem_model <- errorsarlm(
 asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
                poverty + crowded_housing + limited_english +
                age_under_17 + minority,
  data        = model_data,
  listw       = lw_mod,
  Durbin      = FALSE,
  zero.policy = TRUE
)

sdem_model <- errorsarlm(
 asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
                poverty + crowded_housing + limited_english +
                age_under_17 + minority,
  data        = model_data,
  listw       = lw_mod,
  Durbin      = TRUE,
  zero.policy = TRUE
)

# ───────────────────────────────────────────────────────────────────────────────
# 2. Likelihood-ratio test (SDEM vs SEM)
# ───────────────────────────────────────────────────────────────────────────────
# Perform likelihood-ratio test
lr_res <- anova(sdem_model, sem_model)
print(lr_res)
##            Model df    AIC  logLik Test L.Ratio p-value
## sdem_model     1 21 5455.8 -2706.9    1                
## sem_model      2 12 5602.3 -2789.2    2  164.48       0
# ───────────────────────────────────────────────────────────────────────────────
# 3. Hausman test (with fallback to pseudo-inverse)
# ───────────────────────────────────────────────────────────────────────────────
haus_fun <- function(big, small, tol = 1e-10) {
  out <- tryCatch(
    spatialreg::Hausman.test(big, small, tol = tol),
    error = function(e) {
      # Manual computation if Hausman.test fails
      β_big   <- coef(big)
      β_small <- coef(small)[names(β_big)]  # Align coefficients
      V_big   <- vcov(big)
      V_small <- vcov(small)[names(β_big), names(β_big)]
      
      V_diff  <- V_small - V_big  # Note: Order matters (V_small - V_big)
      
      # Use pseudo-inverse if matrix is near-singular
      V_diff_inv <- try(solve(V_diff), silent = TRUE)
      if (inherits(V_diff_inv, "try-error")) {
        V_diff_inv <- Matrix::ginv(V_diff)
      }
      
      q_diff <- β_big - β_small
      chi    <- t(q_diff) %*% V_diff_inv %*% q_diff
      p_val  <- pchisq(chi, df = length(β_big), lower.tail = FALSE)
      
      list(statistic = chi, parameter = length(β_big), p.value = p_val)
    }
  )
  return(out)
}

haus_res <- haus_fun(sdem_model, sem_model)
## Error in solve.default((Vo - Vs), tol = tol) : 
##   system is computationally singular: reciprocal condition number = 1.39971e-11
## Warning in Hausman.test.Sarlm(big, small, tol = tol): (Vo - Vs) inversion
## failure
# Get log-likelihoods
ll_sdem <- as.numeric(logLik(sdem_model))
ll_sem <- as.numeric(logLik(sem_model))

# Calculate test statistic
LR_stat <- 2 * (ll_sdem - ll_sem)
df_diff <- length(coef(sdem_model)) - length(coef(sem_model))

# Get p-value
p_value <- 1 - pchisq(LR_stat, df_diff)

# Create results
lr_res <- list(
  statistic = LR_stat,
  parameter = df_diff,
  p.value = p_value,
  method = "Likelihood ratio test for nested spatial models"
)
class(lr_res) <- "htest"
print(lr_res)
## 
##  Likelihood ratio test for nested spatial models
## 
## data:  
## = 164.48, = 9, p-value < 2.2e-16
# ───────────────────────────────────────────────────────────────────────────────
# 4. Model comparison table
# ───────────────────────────────────────────────────────────────────────────────
haus_p <- as.numeric(haus_res$p.value)     # force numeric
lr_p   <- as.numeric(lr_res$p.value)       # you did this already

comp <- data.frame(
  Model     = c("SEM", "SDEM"),
  AIC       = c(AIC(sem_model),  AIC(sdem_model)),
  LogLik    = c(logLik(sem_model), logLik(sdem_model)),
  LR_p      = c(NA, round(lr_p,   4)),
  Hausman_p = c(NA, round(haus_p, 4))
)
print(comp, row.names = FALSE)
##  Model      AIC    LogLik LR_p Hausman_p
##    SEM 5602.297 -2789.149   NA        NA
##   SDEM 5455.814 -2706.907    0        NA
cat("\nDecision rules:\n",
    "• LR_p < 0.05 ⇒ SDEM fits significantly better → prefer SDEM.\n",
    "• Hausman_p < 0.05 ⇒ SEM is inconsistent → prefer SDEM.\n",
    "• If both p > 0.05 and SEM has lower AIC, prefer SEM.\n")
## 
## Decision rules:
##  • LR_p < 0.05 ⇒ SDEM fits significantly better → prefer SDEM.
##  • Hausman_p < 0.05 ⇒ SEM is inconsistent → prefer SDEM.
##  • If both p > 0.05 and SEM has lower AIC, prefer SEM.
# ───────────────────────────────────────────────────────────────────────────────
# STEP 4 :  Compare all candidate models and pick the “winner”
# ───────────────────────────────────────────────────────────────────────────────
# Assumes you already have these fitted objects in memory:
#   lm_mod     – ordinary least-squares
#   slm_model  – spatial lag model
#   sem_model    – spatial error model   (or sem_modelel2 in earlier code)
#   sdem_model   – spatial Durbin-error model
#   car_model  – conditional autoregressive model   (optional; omit if not fitted)

# 4·1  Gather and rank by AIC
models <- list(
  OLS  = ols_model,
  SLM  = slm_model,
  SEM  = sem_model,
  SDEM = sdem_model,
  CAR  = car_model   # remove this line if you did not fit CAR
)

aic_tbl <- data.frame(
  Model = names(models),
  AIC   = sapply(models, AIC, USE.NAMES = FALSE),
  LogLik= sapply(models, logLik, USE.NAMES = FALSE)
) |>
  dplyr::arrange(AIC)

print(aic_tbl, row.names = FALSE)
##  Model      AIC    LogLik
##   SDEM 5455.814 -2706.907
##    SEM 5602.297 -2789.149
##    SLM 5957.172 -2966.586
##    CAR 6379.991 -3177.995
##    OLS 6866.108 -3422.054

#Step 8: visualization

##residual

# Attach predicted values and residuals from SEM model
model_data$pred_sem  <- predict(sem_model)
## This method assumes the response is known - see manual page
model_data$resid_sem <- residuals(sem_model)
library(tmap)
tmap_mode("plot")
## ℹ tmap mode set to "plot".
# ---------------------------------------------------------------
# 1 & 2. Observed and Predicted maps side-by-side
# ---------------------------------------------------------------
tmap_arrange(
  tm_shape(model_data) +
    tm_polygons("asthma_prev", palette = "Reds", title = "Observed Asthma") +
    tm_layout(title = "Observed Asthma"),
  
  tm_shape(model_data) +
    tm_polygons("pred_sem", palette = "Blues", title = "Predicted (SEM)") +
    tm_layout(title = "Predicted (SEM)"),
  
  ncol = 2
)
## 
## ── tmap v3 code detected ───────────────────────────────────────────────────────
## [v3->v4] `tm_tm_polygons()`: migrate the argument(s) related to the scale of
## the visual variable `fill` namely 'palette' (rename to 'values') to fill.scale
## = tm_scale(<HERE>).[v3->v4] `tm_polygons()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'[v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(title = )`[v3->v4] `tm_polygons()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'[v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(title = )`[cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Reds" is named
## "brewer.reds"Multiple palettes called "reds" found: "brewer.reds", "matplotlib.reds". The first one, "brewer.reds", is returned.
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Blues" is named
## "brewer.blues"Multiple palettes called "blues" found: "brewer.blues", "matplotlib.blues". The first one, "brewer.blues", is returned.

# ---------------------------------------------------------------
# 3. Residuals map
# ---------------------------------------------------------------
tm_shape(model_data) +
  tm_polygons("resid_sem", palette = "-RdBu", style = "quantile",
              title = "Residuals (SEM)") +
  tm_layout(title = "Residuals (SEM)")
## 
## ── tmap v3 code detected ───────────────────────────────────────────────────────
## [v3->v4] `tm_polygons()`: instead of `style = "quantile"`, use fill.scale =
## `tm_scale_intervals()`.
## ℹ Migrate the argument(s) 'style', 'palette' (rename to 'values') to
##   'tm_scale_intervals(<HERE>)'[v3->v4] `tm_polygons()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'[v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(title = )`Multiple palettes called "rd_bu" found: "brewer.rd_bu", "matplotlib.rd_bu". The first one, "brewer.rd_bu", is returned.
## [scale] tm_polygons:() the data variable assigned to 'fill' contains positive and negative values, so midpoint is set to 0. Set 'midpoint = NA' in 'fill.scale = tm_scale_intervals(<HERE>)' to use all visual values (e.g. colors)
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "-RdBu" is named
## "rd_bu" (in long format "brewer.rd_bu")

##model comparison

# 6 Coefficient dot plot (direct effects only)
library(broom)
coef_df <- tidy(sdem_model, conf.int = TRUE)
ggplot(coef_df, aes(estimate, term)) +
  geom_pointrange(aes(xmin=conf.low, xmax=conf.high)) +
  geom_vline(xintercept = 0, linetype="dotted") +
  labs(x="Estimate (95% CI)", y=NULL, title="SDEM Coefficients") +
  theme_bw()

# 7 AIC bar plot
ggplot(aic_tbl, aes(reorder(Model, AIC), AIC)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = round(AIC,1)), vjust = -0.3) +
  labs(y="AIC", x=NULL, title="Model Fit Comparison (lower is better)") +
  theme_minimal()